/* xtea-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/* xtea-asm.S 
 * Author:      Daniel Otte
 * Date:        2006-06-06
 * License:     GPLv3 or later
 *  Implementation of XTEA for AVR
 *  Include xtea.h in your C project to use these functions.
*/

/*
 * Symbolic names for the AVR registers used by the XTEA routines.
 * Each value is a register number (for example, 2 stands for r2).
 */

/* first 32-bit half of the block, least significant byte first */
.set V01, 2
.set V02, 3
.set V03, 4
.set V04, 5

/* second 32-bit half of the block, least significant byte first */
.set V11, 6
.set V12, 7
.set V13, 8
.set V14, 9

/* 32-bit accumulator for the round function result */
.set Accu1, 14
.set Accu2, 15
.set Accu3, 16
.set Accu4, 17

/* 32-bit running sum that is stepped by delta */
.set Sum1, 18
.set Sum2, 19
.set Sum3, 20
.set Sum4, 21

/* 32-bit scratch value (shifted half, then sum + key word) */
.set Func1, 22
.set Func2, 23
.set Func3, 24
.set Func4, 25

/* small general-purpose counter for in-between use */
.set C, 28

.global xtea_enc
/*
 * xtea_enc -- encrypt one 64-bit block with XTEA
 *
 * Arguments (as passed in registers):
 *   r25:r24  pointer to the 8-byte destination for the encrypted block
 *   r23:r22  pointer to the 8-byte block to encrypt
 *   r21:r20  pointer to the 16-byte key
 *
 * Equivalent computation (delta = 0x9E3779B9, sum starts at 0):
 *   repeat 32 times:
 *       v0  += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 *       sum += delta;
 *       v1  += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
 *
 * Both half-rounds share one code path. After each pass the two halves
 * trade places in the V0x/V1x registers, and the T flag remembers which
 * half-round comes next (clear = first, set = second).
 *
 * Register use:
 *   X (r27:r26)  block pointer while loading, destination pointer afterwards
 *   Z (r31:r30)  key pointer
 *   r0           rounds left
 *   r1           used as the zero operand when carrying into r31, so it is
 *                expected to hold 0 on entry
 */
xtea_enc:
	/* prolog: save the registers that the epilog restores */
	push V01
	push V02
	push V03
	push V04
	push V11
	push V12
	push V13
	push V14
	push Accu1
	push Accu2
	push Accu3
	push Accu4
	push C

	/* fetch the block through X; Z keeps the key pointer */
	movw r30, r20
	movw r26, r22
	ld V01, X+
	ld V02, X+
	ld V03, X+
	ld V04, X+
	ld V11, X+
	ld V12, X+
	ld V13, X+
	ld V14, X+
	movw r26, r24		/* X now addresses the destination */

	ldi C, 32
	mov r0, C		/* r0 = number of rounds */
	clr Sum1
	clr Sum2
	clr Sum3
	clr Sum4		/* sum = 0 */
	clt			/* begin with the first half-round */

.Lenc_half:
	/* Func = V1 >> 5 */
	movw Func1, V11
	movw Func3, V13
	ldi C, 5
.Lenc_shift_right:
	lsr Func4
	ror Func3
	ror Func2
	ror Func1
	dec C
	brne .Lenc_shift_right

	/* Accu = V1 << 4 */
	movw Accu1, V11
	movw Accu3, V13
	ldi C, 4
.Lenc_shift_left:
	lsl Accu1
	rol Accu2
	rol Accu3
	rol Accu4
	dec C
	brne .Lenc_shift_left

	/* Accu = ((V1 << 4) ^ (V1 >> 5)) + V1 */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V11
	adc Accu2, V12
	adc Accu3, V13
	adc Accu4, V14

	/* C = byte offset of the key word for this half-round */
	brts .Lenc_key_from_bit11
	mov C, Sum1		/* first half: (sum & 3) * 4 */
	andi C, 0x03
	lsl C
	lsl C
	set			/* next pass is the second half */
	rjmp .Lenc_key_load
.Lenc_key_from_bit11:
	mov C, Sum2		/* second half: ((sum >> 11) & 3) * 4 */
	lsr C
	andi C, (0x03 << 2)
	clt			/* next pass is a first half again */

.Lenc_key_load:
	/* Func = selected key word; Z is moved there and back again */
	add r30, C
	adc r31, r1
	ld  Func1, Z
	ldd Func2, Z+1
	ldd Func3, Z+2
	ldd Func4, Z+3
	sub r30, C
	sbci r31, 0

	/* Func = sum + key word */
	add Func1, Sum1
	adc Func2, Sum2
	adc Func3, Sum3
	adc Func4, Sum4

	/* Accu = V0 + ((((V1 << 4) ^ (V1 >> 5)) + V1) ^ (sum + key word)) */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V01
	adc Accu2, V02
	adc Accu3, V03
	adc Accu4, V04

	/* trade places: the untouched half moves to V0, the new value to V1 */
	movw V01, V11
	movw V03, V13
	movw V11, Accu1
	movw V13, Accu3

	/* T set here means the first half just finished: sum += delta */
	brtc .Lenc_round_done
	ldi C, 0xB9		/* delta == 0x9E3779B9, added byte by byte */
	add Sum1, C
	ldi C, 0x79
	adc Sum2, C
	ldi C, 0x37
	adc Sum3, C
	ldi C, 0x9E
	adc Sum4, C
	rjmp .Lenc_half

.Lenc_round_done:
	dec r0
	breq .Lenc_store
	rjmp .Lenc_half

.Lenc_store:
	/* write the block to the destination */
	st X+, V01
	st X+, V02
	st X+, V03
	st X+, V04
	st X+, V11
	st X+, V12
	st X+, V13
	st X+, V14

	/* epilog: restore in reverse order of the prolog */
	pop C
	pop Accu4
	pop Accu3
	pop Accu2
	pop Accu1
	pop V14
	pop V13
	pop V12
	pop V11
	pop V04
	pop V03
	pop V02
	pop V01
	ret

;####################################################################
 
 /*
  * xtea_dec -- decrypt one 64-bit block with XTEA
  *
  * Arguments (as passed in registers):
  *   r25:r24  pointer to the 8-byte destination for the decrypted block
  *   r23:r22  pointer to the 8-byte block to decrypt
  *   r21:r20  pointer to the 16-byte key
  *
  * C reference:
  *   void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) {
  *       uint32_t v0=v[0], v1=v[1], i;
  *       uint32_t sum=0xC6EF3720, delta=0x9E3779B9;
  *       for(i=0; i<32; i++) {
  *           v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
  *           sum -= delta;
  *           v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
  *       }
  *       dest[0]=v0; dest[1]=v1;
  *   }
  *
  * Both statements of the loop body share one code path. Every pass
  * subtracts from the V1x registers and then swaps V0x with V1x, so two
  * passes make one round. The T flag tells the passes apart:
  *   T clear  first statement  (key index from sum >> 11, then sum -= delta)
  *   T set    second statement (key index from sum & 3, then count the round)
  *
  * Register use:
  *   X (r27:r26)  block pointer while loading, destination pointer afterwards
  *   Z (r31:r30)  key pointer
  *   r0           rounds left
  *   r1           used as the zero operand when carrying into r31, so it is
  *                expected to hold 0 on entry
  */
.global xtea_dec
xtea_dec:
	/* prolog: save the registers that the epilog restores */
	push r2
	push r3
	push r4
	push r5
	push r6
	push r7
	push r8
	push r9
	push r14
	push r15
	push r16
	push r17
	push r28

	/* load the block */
	movw r26, r22		/* X points to block */
	movw r30, r20		/* Z points to key */
	ld V01, X+
	ld V02, X+
	ld V03, X+
	ld V04, X+
	ld V11, X+
	ld V12, X+
	ld V13, X+
	ld V14, X+
	movw r26, r24		/* X points to destination */

	ldi Sum1, 32
	mov r0, Sum1		/* r0 is the round counter */
	ldi Sum1, 0x20		/* sum = 0xC6EF3720 (32 * delta, mod 2^32) */
	ldi Sum2, 0x37
	ldi Sum3, 0xEF
	ldi Sum4, 0xC6
	clt			/* begin with the first statement of a round */

.Ldec_half:
	movw Accu1, V01
	movw Accu3, V03
	ldi C, 4
.Ldec_shift_left:
	lsl Accu1
	rol Accu2
	rol Accu3
	rol Accu4
	dec C
	brne .Ldec_shift_left	/* Accu == V0 << 4 */

	movw Func1, V01
	movw Func3, V03
	ldi C, 5
.Ldec_shift_right:
	lsr Func4
	ror Func3
	ror Func2
	ror Func1
	dec C
	brne .Ldec_shift_right	/* Func == V0 >> 5 */

	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V01
	adc Accu2, V02
	adc Accu3, V03
	adc Accu4, V04		/* Accu == ( (V0<<4)^(V0>>5) ) + V0 */

	/* C = byte offset of the key word for this pass */
	brts .Ldec_key_from_low_bits
	mov C, Sum2		/* ((sum >> 11) & 3) * 4 */
	lsr C
	andi C, (0x03 << 2)
	set
	rjmp .Ldec_key_load
.Ldec_key_from_low_bits:
	mov C, Sum1		/* (sum & 3) * 4 */
	andi C, 0x03
	lsl C
	lsl C
	clt

.Ldec_key_load:
	/* Func = selected key word; Z is moved there and back again */
	add r30, C
	adc r31, r1
	ld  Func1, Z
	ldd Func2, Z+1
	ldd Func3, Z+2
	ldd Func4, Z+3
	sub r30, C
	sbci r31, 0
	add Func1, Sum1
	adc Func2, Sum2
	adc Func3, Sum3
	adc Func4, Sum4		/* Func = sum + key word */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4	/* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key word) */
	sub V11, Accu1
	sbc V12, Accu2
	sbc V13, Accu3
	sbc V14, Accu4		/* V1 -= Accu */

	/* swap V0 and V1 so the next pass works on the other half */
	movw Accu1, V01
	movw Accu3, V03
	movw V01, V11
	movw V03, V13
	movw V11, Accu1
	movw V13, Accu3

	/* T set here means the first statement just finished: sum -= delta */
	brtc .Ldec_round_done
	subi Sum1, 0xB9		/* delta == 0x9E3779B9 */
	sbci Sum2, 0x79
	sbci Sum3, 0x37
	sbci Sum4, 0x9E
	rjmp .Ldec_half

.Ldec_round_done:
	dec r0
	breq .Ldec_store
	rjmp .Ldec_half

.Ldec_store:
	/* write block back */
	st X+, V01
	st X+, V02
	st X+, V03
	st X+, V04
	st X+, V11
	st X+, V12
	st X+, V13
	st X+, V14

	/* epilog */
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r9
	pop r8
	pop r7
	pop r6
	pop r5
	pop r4
	pop r3
	pop r2
	ret

;####################################################################
 
 #ifdef TWO_IN_ONE
 /* Experimental combined variant, only assembled when TWO_IN_ONE is defined.
 	The idea is to use the same base structure for encryption and decryption
 	and to indicate the operation mode in the highest bit of param3 (the
 	16-bit pointer to the key). The original author considered this acceptable
 	because the largest Atmel part of the time had "only" 8k of RAM, and warned
 	against using this feature together with external RAM.

 	NOTE(review): this section does not look usable as written; confirm the
 	intent before defining TWO_IN_ONE:
 	 - xtea_enc and xtea_dec are already defined earlier in this file outside
 	   any conditional, so this section introduces a second xtea_dec label,
 	   which an assembler would normally reject as a redefinition;
 	 - there is no xtea_enc label in front of the ori below, so that
 	   instruction has no entry point of its own in this section;
 	 - the T flag taken from bit 7 of r21 is cleared again by the clt in front
 	   of the round loop, so the mode bit never reaches the round logic, and
 	   the loop below only contains the decryption data path.
 */
.global xtea_enc
 	ori r21, 0x80	/* mark the key pointer with the mode bit, then fall through */
 	
.global xtea_dec
; == xtea_dec ==
; xtea decryption function
; param1: 16-bit pointer to destination for decrypted block 
;  given in r25,r24
; param2: 16-bit pointer to the block (64-bit) which is to be decrypted 
;  given in r23,r22
; param3: 16-bit pointer to the key (128-bit) 
;  given in r21,r20
;
/*
void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) {
    uint32_t v0=v[0], v1=v[1], i;
    uint32_t sum=0xC6EF3720, delta=0x9E3779B9;
    for(i=0; i<32; i++) {
        v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
        sum -= delta;
        v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
    }
    dest[0]=v0; dest[1]=v1;
}
*/

xtea_dec:
 /* prolog: save the registers that the epilog restores */
 	push r2
 	push r3
 	push r4
 	push r5
 	push r6
 	push r7
 	push r8
 	push r9
 	push r14
 	push r15
 	push r16
 	push r17
 	push r28 
 /* copy the mode bit (bit 7 of r21) into the T flag: set if entered through
    the ori above, clear otherwise.
    NOTE(review): the clt before the round loop discards this value again. */
 	bst r21, 7
 	andi r21, 0x7f /* strip the mode bit so r21:r20 is a plain address again */
 /* load the block */
 	movw r26, r22 /* X points to block */
 	movw r30, r20 /* Z points to key   */
 	ld V01, X+
 	ld V02, X+
 	ld V03, X+
 	ld V04, X+
 	ld V11, X+
 	ld V12, X+
 	ld V13, X+
 	ld V14, X+
 	movw r26, r24 /* X points to destination */
 
	ldi Sum1, 32
	mov r0, Sum1 /* r0 is the round counter */
	ldi Sum1, 0x20 /* sum = 0xC6EF3720 */
	ldi Sum2, 0x37
	ldi Sum3, 0xEF
	ldi Sum4, 0xC6
	clt	/* T clear: the first pass uses the key index from sum >> 11 */

1:	/* one pass = one of the two subtractions of a round */
	movw Accu1, V01
	movw Accu3, V03
	ldi C, 4
2:	lsl Accu1
	rol Accu2
	rol Accu3
	rol Accu4
	dec C
	brne 2b			/* Accu == V0 << 4 */

	movw Func1, V01
	movw Func3, V03
	ldi C, 5
3:	lsr Func4
	ror Func3
	ror Func2
	ror Func1
	dec C
	brne 3b			/* Func == V0 >> 5 */
	
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V01
	adc Accu2, V02
	adc Accu3, V03
	adc Accu4, V04	/* Accu == ( (V0<<4)^(V0>>5) ) + V0 */
	
	brts 4f
	mov C, Sum2	/* key offset = ((sum >> 11) & 3) * 4 */
	lsr C
	andi C,(0x03 <<2)
	set
	rjmp 5f
4:	
	mov C, Sum1	/* key offset = (sum & 3) * 4 */
	andi C, 0x03
	lsl C
	lsl C
	clt
	
5:	
	add r30, C	/* move Z to the selected key word */
	adc r31, r1	/* r1 serves as the zero operand for the carry */
	ld  Func1, Z
	ldd Func2, Z+1
	ldd Func3, Z+2
	ldd Func4, Z+3 /* Func = key word at byte offset C */
	sub r30, C	/* move Z back to the start of the key */
	sbci r31, 0
	add Func1, Sum1
	adc Func2, Sum2
	adc Func3, Sum3
	adc Func4, Sum4 
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key word)  */
	sub V11, Accu1
	sbc V12, Accu2
	sbc V13, Accu3
	sbc V14, Accu4	/* V1 -= Accu */
	
	/* swap V0 and V1 so the next pass works on the other half */
	movw Accu1, V01
	movw Accu3, V03
	movw V01, V11
	movw V03, V13
	movw V11, Accu1
	movw V13, Accu3
	
	/* sum -= delta */ /* delta == 0x9E3779B9 */
	brtc 6f	/* T clear: both passes of this round are done */
	subi Sum1, 0xB9
	sbci Sum2, 0x79
	sbci Sum3, 0x37
	sbci Sum4, 0x9E
	rjmp 1b
	
6:	
	dec r0
	breq 7f
	rjmp 1b 
 
7:
 /* write block back */
 	st X+, V01
 	st X+, V02
 	st X+, V03
  	st X+, V04
 	st X+, V11
 	st X+, V12
 	st X+, V13
 	st X+, V14
 
 /* epilog: restore in reverse order of the prolog */
 	pop r28
 	pop r17
 	pop r16
 	pop r15
 	pop r14
 	pop r9
 	pop r8
 	pop r7
 	pop r6
 	pop r5
 	pop r4
 	pop r3
 	pop r2
 	ret
 	
 #endif

